From: Kentaro Hayashi Date: Sat, 19 Aug 2023 08:14:56 +0000 (+0100) Subject: Import sentencepiece_0.1.99-4.debian.tar.xz X-Git-Tag: archive/raspbian/0.2.0-1+rpi1^2^2~5^2 X-Git-Url: https://dgit.raspbian.org/%22http://www.example.com/cgi/%22/%22http:/www.example.com/cgi/%22?a=commitdiff_plain;h=70d968b3389e86268d736d0c96cc22b8563bffd8;p=sentencepiece.git Import sentencepiece_0.1.99-4.debian.tar.xz [dgit import tarball sentencepiece 0.1.99-4 sentencepiece_0.1.99-4.debian.tar.xz] --- 70d968b3389e86268d736d0c96cc22b8563bffd8 diff --git a/README.Debian b/README.Debian new file mode 100644 index 0000000..3aa4ad1 --- /dev/null +++ b/README.Debian @@ -0,0 +1,48 @@ +# sentencepiece for Debian + +A record of information specific to the sentencepiece package. + +## trixie or later + +Removed the upstream patches which were introduced in 0.1.97-3, because they are already applied upstream. + +## bookworm + +Upstream sentencepiece 0.1.97 was initially released around June 6, 2022, +but it was a pre-release version. +The official release version was shipped on Aug 7, 2022. + +Accidentally, the pre-release version was packaged as 0.1.97-1. +Thus, some commits were not included in 0.1.97-1. + +To fix this issue, the commits since 5e5adf2f851a1514ccc435aae11ee830c438321b +were applied as the following patch files. + +See https://github.com/google/sentencepiece/issues/794 for details. 
+ +0001-update-python-wrapper.patch +0002-remove-debug-symbols-from-wheel-package.patch +0003-allow-tab-character-to-be-used-in-user_defined_symbo.patch +0004-add-test-to-use-tab-as-user-defined-symbols.patch +0005-Uses-C-17-by-default.patch +0006-Uses-std-atomic-to-define-global-variable.patch +0007-Fix-a-typo.patch +0008-Uses-absl-string_view-as-much-as-possible.patch +0009-Fixed-build-break.patch +0010-Added-ImmutableSentencePiece-class.patch +0011-add-verbose-option.patch +0012-Supports-ImmutableSentencePieceText-from-python-modu.patch +0013-Adds-more-unittests.patch +0014-Adds-SWIGPYTHON-flag.patch +0015-remove-unused-ifdef-SWIG-macro.patch +0016-Fixed-test-failure.patch +0017-Uses-property-in-immutable-proto.patch +0018-automatically-detect-the-number-of-CPUs-in-batch-pro.patch +0019-support-slice-in-pieces-nbests-objects.patch +0020-Updated-the-document.patch +0021-Fixed-errors-in-example-notebook.patch +0022-Fix-dead-links.patch +0023-added-ShutdownLibrary-function-to-uninitialize-globa.patch +0024-Fixed-the-issue-of-concatinating-paths-for-pkg-confi.patch + + -- kenhys , Sat, 17 Jun 2023 23:16:19 +0900 diff --git a/changelog b/changelog new file mode 100644 index 0000000..d2ab754 --- /dev/null +++ b/changelog @@ -0,0 +1,221 @@ +sentencepiece (0.1.99-4) unstable; urgency=medium + + * debian/clean + - Fix FTBFS (double build) (Closes: #1047552) + + -- Kentaro Hayashi Sat, 19 Aug 2023 17:14:56 +0900 + +sentencepiece (0.1.99-3) unstable; urgency=medium + + * debian/tests/control + - Fix regression (preventing migration) about + python module's autopkgtest. + + -- Kentaro Hayashi Wed, 21 Jun 2023 14:44:27 +0900 + +sentencepiece (0.1.99-2) unstable; urgency=medium + + * debian/patches/fix-ftbfs-big-endian.patch + - Add patch to fix FTBFS on big endian platform. 
+ * debian/tests/control + - Fix W: illegal-runtime-test-name warning + + -- Kentaro Hayashi Tue, 20 Jun 2023 19:28:50 +0900 + +sentencepiece (0.1.99-1) unstable; urgency=medium + + * New upstream version 0.1.99 + * debian/control + - Bump Standards-Version to 4.6.2. No other changes are required. + * debian/patches/disable-static-library.patch + debian/patches/support-python-module-in-place.patch + - Refresh patch files for 0.1.99 + * debian/patches/*.patch + - Drop deprecated patch files which was already applied in upstream. + * debian/README.Debian + - Update explanation of debian/patches. + + -- Kentaro Hayashi Sun, 18 Jun 2023 00:04:54 +0900 + +sentencepiece (0.1.97-3) unstable; urgency=medium + + * debian/patches/0001-update-python-wrapper.patch + debian/patches/0002-remove-debug-symbols-from-wheel-package.patch + debian/patches/0003-allow-tab-character-to-be-used-in-user_defined_symbo.patch + debian/patches/0004-add-test-to-use-tab-as-user-defined-symbols.patch + debian/patches/0005-Uses-C-17-by-default.patch + debian/patches/0006-Uses-std-atomic-to-define-global-variable.patch + debian/patches/0007-Fix-a-typo.patch + debian/patches/0008-Uses-absl-string_view-as-much-as-possible.patch + debian/patches/0009-Fixed-build-break.patch + debian/patches/0010-Added-ImmutableSentencePiece-class.patch + debian/patches/0011-add-verbose-option.patch + debian/patches/0012-Supports-ImmutableSentencePieceText-from-python-modu.patch + debian/patches/0013-Adds-more-unittests.patch + debian/patches/0014-Adds-SWIGPYTHON-flag.patch + debian/patches/0015-remove-unused-ifdef-SWIG-macro.patch + debian/patches/0016-Fixed-test-failure.patch + debian/patches/0017-Uses-property-in-immutable-proto.patch + debian/patches/0018-automatically-detect-the-number-of-CPUs-in-batch-pro.patch + debian/patches/0019-support-slice-in-pieces-nbests-objects.patch + debian/patches/0020-Updated-the-document.patch + debian/patches/0021-Fixed-errors-in-example-notebook.patch + 
debian/patches/0022-Fix-dead-links.patch + debian/patches/0023-added-ShutdownLibrary-function-to-uninitialize-globa.patch + debian/patches/0024-Fixed-the-issue-of-concatinating-paths-for-pkg-confi.patch + - Add missing patch files for 0.1.97. + * debian/README.Debian + - Add explanation of debian/patches. + + -- Kentaro Hayashi Mon, 21 Nov 2022 22:43:46 +0900 + +sentencepiece (0.1.97-2) unstable; urgency=medium + + * Team upload + + [ Steve Langasek ] + * debian/patches/header-dependencies.patch: include necessary headers + to ensure IS_BIG_ENDIAN is defined, see #1017360. + + -- Graham Inggs Sun, 18 Sep 2022 05:30:57 +0000 + +sentencepiece (0.1.97-1) unstable; urgency=medium + + * New upstream version 0.1.97 + * debian/copyright + - Update maintainer E-mail address + * debian/control + - Bump Standards-Version to 4.6.1. No other changes are required. + * debian/patches/support-python-module-in-place.patch + - Refresh path to build python module. + + -- Kentaro Hayashi Tue, 14 Jun 2022 20:19:58 +0900 + +sentencepiece (0.1.96-1) unstable; urgency=medium + + * New upstream version 0.1.96 + * debian/control + - Bump standard-version to 4.5.1. No changes are required. + + -- Kentaro Hayashi Wed, 18 Aug 2021 20:52:46 +0900 + +sentencepiece (0.1.95-1) unstable; urgency=medium + + * New upstream version 0.1.95 + * debian/patches/support-python-module-in-place.patch + - Fix undefined symbol when importing python module (Closes: #979040) + + -- Kentaro Hayashi Thu, 11 Feb 2021 17:36:23 +0900 + +sentencepiece (0.1.94-2) unstable; urgency=medium + + * Fix FTBFS on armel/mipsel (Closes: #977235) + + -- Kentaro Hayashi Wed, 16 Dec 2020 21:18:15 +0900 + +sentencepiece (0.1.94-1) unstable; urgency=medium + + * New upstream version 0.1.94 + * debian/patches/support-python-module-in-place.patch + - Refresh path to build python module. 
+ * debian/patches/fix-ftbfs-ports.patch + debian/patches/mutiarch-support.patch + - Remove needless patch because these patch was merged + to google/sentencepiece. + + -- Kentaro Hayashi Wed, 28 Oct 2020 21:02:07 +0900 + +sentencepiece (0.1.93-1) unstable; urgency=medium + + * New upstream version 0.1.93 + * debian/source/lintian-overrides + - Remove needless override. + + -- Kentaro Hayashi Thu, 15 Oct 2020 21:32:05 +0900 + +sentencepiece (0.1.92-3) unstable; urgency=medium + + * debian/patches/fix-ftbfs-ports.patch + - Fix FTBFS on powerpc + + -- Kentaro Hayashi Sat, 03 Oct 2020 20:48:27 +0900 + +sentencepiece (0.1.92-2) unstable; urgency=medium + + * debian/patches/0002-Change-in-order-to-build-Python-modules-in-place.patch + - Fix FTBFS on hurd-i386 + * debian/patches/0004-Fix-FTBFS-on-armel-and-mipsel.patch + - Fix missing dependency to atomic library (powerpc,m68k,sh4) + + -- Kentaro Hayashi Sat, 26 Sep 2020 20:27:17 +0900 + +sentencepiece (0.1.92-1) unstable; urgency=medium + + * New upstream version 0.1.92 + + -- Kentaro Hayashi Fri, 19 Jun 2020 19:38:49 +0900 + +sentencepiece (0.1.91-1) unstable; urgency=medium + + * New upstream version 0.1.91 + + -- Kentaro Hayashi Fri, 22 May 2020 15:17:42 +0900 + +sentencepiece (0.1.90-3) unstable; urgency=medium + + * debian/patches/0004-Fix-FTBFS-on-armel-and-mipsel.patch + - Refresh patch to fix FTBFS. + + -- Kentaro Hayashi Sun, 17 May 2020 09:02:23 +0900 + +sentencepiece (0.1.90-2) unstable; urgency=medium + + * debian/patches/0004-Fix-FTBFS-on-armel-and-mipsel.patch + - Add patch to fix FTBFS on mipsel and armel + + -- Kentaro Hayashi Sat, 16 May 2020 16:16:45 +0900 + +sentencepiece (0.1.90-1) unstable; urgency=medium + + * New upstream version 0.1.90 + * debian/control + - Update Uploaders: + - Bump standard-version to 4.5.0 + - Bump compat version to 13. 
+ * debian/source/lintian-overrides + - Fix false positive source-is-missing + * debian/patches/0003-Disable-static-library-explicitly.patch + - Disable to build static library + + -- Kentaro Hayashi Wed, 13 May 2020 19:09:34 +0900 + +sentencepiece (0.1.84-1) unstable; urgency=medium + + * New upstream version 0.1.84 (Closes: #939860) + + [ TSUCHIYA Masatoshi ] + * Initial packaging tasks. + * Remove pipeline configurations for BitBucket. + + [ Kentaro Hayashi ] + * debian/gbp.conf + - Add basic configuration about debian-branch + * debian/watch + - Add missing watch file to detect a new release + * debian/control + - Update deprecated Priority: to optional + - Add Vcs-* fields + - Fix W: sentencepiece: description-synopsis-starts-with-article + - Bump standard version to 4.4.1 + - Update Vcs-* under science-team + - Bump up compatibility level + - Drop python2 support + * debian/copyright + - Use https:// + - Update copyright about third party modules + * debian/rules + - Enable hardening + * debian/salsa-ci.yml + - Add Salsa CI configuration + + -- Kentaro Hayashi Thu, 17 Oct 2019 13:33:34 +0900 diff --git a/clean b/clean new file mode 100644 index 0000000..3dfef3f --- /dev/null +++ b/clean @@ -0,0 +1,2 @@ +python/build/ +python/src/sentencepiece.egg-info/ diff --git a/control b/control new file mode 100644 index 0000000..74023b8 --- /dev/null +++ b/control @@ -0,0 +1,61 @@ +Source: sentencepiece +Section: science +Priority: optional +Maintainer: Debian Science Maintainers +Uploaders: + TSUCHIYA Masatoshi , + Kentaro Hayashi +Build-Depends: + debhelper-compat (= 13), + protobuf-compiler, + libprotobuf-dev, + dh-python, + python3-all-dev, + quilt, + cmake, + python3-setuptools +Standards-Version: 4.6.2 +Homepage: https://github.com/google/sentencepiece +Vcs-Browser: https://salsa.debian.org/science-team/sentencepiece +Vcs-Git: https://salsa.debian.org/science-team/sentencepiece.git +Rules-Requires-Root: no + +Package: sentencepiece +Architecture: any +Depends: 
${shlibs:Depends}, ${misc:Depends} +Description: Unsupervised text tokenizer and detokenizer + SentencePiece is an unsupervised text tokenizer/detokenizer mainly + designed for Neural Network-based text generation systems where the + vocabulary size is predetermined prior to the neural model training. + +Package: libsentencepiece0 +Section: libs +Architecture: any +Depends: ${shlibs:Depends}, ${misc:Depends} +Description: Library files of SentencePiece + SentencePiece is an unsupervised text tokenizer/detokenizer mainly + designed for Neural Network-based text generation systems where the + vocabulary size is predetermined prior to the neural model training. + +Package: libsentencepiece-dev +Section: libdevel +Architecture: any +Depends: libsentencepiece0 (= ${binary:Version}), ${misc:Depends} +Description: Header files of SentencePiece + SentencePiece is an unsupervised text tokenizer/detokenizer mainly + designed for Neural Network-based text generation systems where the + vocabulary size is predetermined prior to the neural model training. + +Package: python3-sentencepiece +Section: python +Architecture: any +Depends: + ${shlibs:Depends}, + ${misc:Depends}, + ${python3:Depends} +Description: SentencePiece binding for Python3 + SentencePiece is an unsupervised text tokenizer/detokenizer mainly + designed for Neural Network-based text generation systems where the + vocabulary size is predetermined prior to the neural model training. + . + python3-sentencepiece is its binding for Python3. diff --git a/copyright b/copyright new file mode 100644 index 0000000..17b9239 --- /dev/null +++ b/copyright @@ -0,0 +1,150 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: sentencepiece +Source: https://github.com/google/sentencepiece + +Files: * +Copyright: 2017 Taku Kudo +License: Apache-2.0 + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + . + http://www.apache.org/licenses/LICENSE-2.0 + . + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied. See the License for the specific language governing + permissions and limitations under the License. + +Files: debian/* +Copyright: + 2016 TSUCHIYA Masatoshi + 2019-2022 Kentaro Hayashi +License: GPL-2+ + This package is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + . + This package is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + . + You should have received a copy of the GNU General Public License + along with this program. If not, see + . + On Debian systems, the complete text of the GNU General + Public License version 2 can be found in "/usr/share/common-licenses/GPL-2". + +Files: third_party/esaxx/* +Copyright: 2010 Daisuke Okanohara +License: MIT + +Files: third_party/darts_clone/* +Copyright: 2008-2011, Susumu Yata +License: BSD-3-clause + +Files: third_party/protobuf-lite/* +Copyright: 2008 Google Inc. +License: BSD-3-clause + +Files: data/Scripts.txt +Copyright: 1991-2016 Unicode, Inc. +License: Unicode + COPYRIGHT AND PERMISSION NOTICE + . + Copyright © 1991-2016 Unicode, Inc. All rights reserved. + Distributed under the Terms of Use in https://www.unicode.org/copyright.html. + . 
+ Permission is hereby granted, free of charge, to any person obtaining + a copy of the Unicode data files and any associated documentation + (the "Data Files") or Unicode software and any associated documentation + (the "Software") to deal in the Data Files or Software + without restriction, including without limitation the rights to use, + copy, modify, merge, publish, distribute, and/or sell copies of + the Data Files or Software, and to permit persons to whom the Data Files + or Software are furnished to do so, provided that either + (a) this copyright and permission notice appear with all copies + of the Data Files or Software, or + (b) this copyright and permission notice appear in associated + Documentation. + . + THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF + ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE + WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT OF THIRD PARTY RIGHTS. + IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS + NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL + DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + PERFORMANCE OF THE DATA FILES OR SOFTWARE. + . + Except as contained in this notice, the name of a copyright holder + shall not be used in advertising or otherwise to promote the sale, + use or other dealings in these Data Files or Software without prior + written authorization of the copyright holder. + +Files: data/botchan.txt +Copyright: Kin-nosuke Natsume +License: public-domain + Written by Kin-nosuke Natsume and put into the public domain. + It was translated by Yasotaro Morri and published by Project Gutenberg. 
+ +Files: data/wagahaiwa_nekodearu.txt +Copyright: Kin-nosuke Natsume +License: public-domain + Written by Kin-nosuke Natsume and put into the public domain. + It was digitized by Aozora Bunko collaborators and published by Aozora Bunko. + +License: MIT + Permission is hereby granted, free of charge, to any person + obtaining a copy of this software and associated documentation + files (the "Software"), to deal in the Software without + restriction, including without limitation the rights to use, + copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the + Software is furnished to do so, subject to the following + conditions: + . + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + . + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + OTHER DEALINGS IN THE SOFTWARE. + +License: BSD-3-clause + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + . + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the + distribution. 
+ - Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived + from this software without specific prior written permission. + . + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/gbp.conf b/gbp.conf new file mode 100644 index 0000000..7c93e18 --- /dev/null +++ b/gbp.conf @@ -0,0 +1,3 @@ +[DEFAULT] +debian-branch = master + diff --git a/libsentencepiece-dev.install b/libsentencepiece-dev.install new file mode 100644 index 0000000..b363748 --- /dev/null +++ b/libsentencepiece-dev.install @@ -0,0 +1,3 @@ +usr/lib/*/lib*.so +usr/lib/*/pkgconfig/* +usr/include/* diff --git a/libsentencepiece0.install b/libsentencepiece0.install new file mode 100644 index 0000000..3ddde58 --- /dev/null +++ b/libsentencepiece0.install @@ -0,0 +1 @@ +usr/lib/*/lib*.so.* diff --git a/patches/disable-static-library.patch b/patches/disable-static-library.patch new file mode 100644 index 0000000..203e3ad --- /dev/null +++ b/patches/disable-static-library.patch @@ -0,0 +1,42 @@ +From: Kentaro Hayashi +Date: Sat, 17 Jun 2023 22:47:25 +0900 +Subject: Disable static library explicitly + +Forwarded: not-needed +Bug-Debian: N/A +--- + src/CMakeLists.txt | 11 +---------- + 1 file changed, 1 insertion(+), 10 deletions(-) 
+ +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 1c7726e..077d37d 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -222,16 +222,10 @@ if (SPM_ENABLE_SHARED) + add_library(sentencepiece_train SHARED ${SPM_TRAIN_SRCS}) + endif() + +-add_library(sentencepiece-static STATIC ${SPM_SRCS}) +-add_library(sentencepiece_train-static STATIC ${SPM_TRAIN_SRCS}) +- +-target_link_libraries(sentencepiece-static INTERFACE ${SPM_LIBS}) +-target_link_libraries(sentencepiece_train-static INTERFACE sentencepiece-static ${SPM_LIBS}) +- + if (SPM_ENABLE_SHARED) + target_link_libraries(sentencepiece ${SPM_LIBS}) + target_link_libraries(sentencepiece_train ${SPM_LIBS} sentencepiece) +- set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train sentencepiece-static sentencepiece_train-static) ++ set(SPM_INSTALLTARGETS sentencepiece sentencepiece_train) + set_target_properties(sentencepiece sentencepiece_train PROPERTIES SOVERSION 0 VERSION 0.0.0) + set_target_properties(sentencepiece PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES) + set_target_properties(sentencepiece_train PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS YES) +@@ -248,9 +242,6 @@ else() + set(SPM_INSTALLTARGETS sentencepiece-static sentencepiece_train-static) + endif() + +-set_target_properties(sentencepiece-static PROPERTIES OUTPUT_NAME "sentencepiece") +-set_target_properties(sentencepiece_train-static PROPERTIES OUTPUT_NAME "sentencepiece_train") +- + if (NOT MSVC) + if (SPM_COVERAGE) + set(CMAKE_CXX_FLAGS "-O0 -Wall -fPIC -coverage ${CMAKE_CXX_FLAGS}") diff --git a/patches/fix-ftbfs-big-endian.patch b/patches/fix-ftbfs-big-endian.patch new file mode 100644 index 0000000..3f7db6a --- /dev/null +++ b/patches/fix-ftbfs-big-endian.patch @@ -0,0 +1,155 @@ +From: Kentaro Hayashi +Date: Tue, 20 Jun 2023 17:12:58 +0900 +Subject: Fixes build test errors in big-endian machines + +Author: Taku Kudo +Origin: https://github.com/google/sentencepiece/commit/827591a0c552f2187aac8b8e0f999e8ff31aad81.patch +Forwarded: 
not-needed +--- + CMakeLists.txt | 5 +++++ + src/CMakeLists.txt | 2 ++ + src/common.h | 10 ++++++---- + src/normalizer.cc | 15 ++++++++------- + src/unigram_model_trainer_test.cc | 5 ++--- + 5 files changed, 23 insertions(+), 14 deletions(-) + +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 1b3af04..a2f0f77 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -33,6 +33,11 @@ option(SPM_NO_THREADLOCAL "Disable thread_local operator" OFF) + option(SPM_USE_BUILTIN_PROTOBUF "Use built-in protobuf" ON) + option(SPM_USE_EXTERNAL_ABSL "Use external abseil" OFF) + option(SPM_ENABLE_MSVC_MT_BUILD, "Use /MT flag in MSVC build" OFF) ++option(SPM_CROSS_SYSTEM_PROCESSOR, "Override system processor" "") ++ ++if (SPM_CROSS_SYSTEM_PROCESSOR) ++ set(CMAKE_SYSTEM_PROCESSOR ${SPM_CROSS_SYSTEM_PROCESSOR}) ++endif() + + # Disable shared build on windows + if(WIN32) +diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt +index 077d37d..09ef57f 100644 +--- a/src/CMakeLists.txt ++++ b/src/CMakeLists.txt +@@ -208,6 +208,7 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR + (${CMAKE_SYSTEM_PROCESSOR} MATCHES "mips") OR + (${CMAKE_SYSTEM_PROCESSOR} MATCHES "m68k") OR + (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc") OR ++ (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc") OR + (${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch") OR + (${CMAKE_SYSTEM_PROCESSOR} MATCHES "sh4")) + find_library(ATOMIC_LIB NAMES atomic libatomic.so libatomic.so.1) +@@ -217,6 +218,7 @@ if ((${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm") OR + endif() + endif() + ++ + if (SPM_ENABLE_SHARED) + add_library(sentencepiece SHARED ${SPM_SRCS}) + add_library(sentencepiece_train SHARED ${SPM_TRAIN_SRCS}) +diff --git a/src/common.h b/src/common.h +index ef5546d..b38b3f7 100644 +--- a/src/common.h ++++ b/src/common.h +@@ -79,10 +79,6 @@ char (&ArraySizeHelper(const T (&array)[N]))[N]; + #endif + #endif + +-#ifdef IS_BIG_ENDIAN +-inline uint32 Swap32(uint32 x) { return __builtin_bswap32(x); } +-#endif +- + namespace sentencepiece { + #ifdef 
OS_WIN + namespace win32 { +@@ -90,6 +86,12 @@ std::wstring Utf8ToWide(const absl::string_view input); + } // namespace win32 + #endif + ++#ifdef IS_BIG_ENDIAN ++namespace util { ++inline uint32 Swap32(uint32 x) { return __builtin_bswap32(x); } ++} // namespace util ++#endif ++ + namespace error { + + void Abort(); +diff --git a/src/normalizer.cc b/src/normalizer.cc +index 2ab8084..53e43c4 100644 +--- a/src/normalizer.cc ++++ b/src/normalizer.cc +@@ -260,14 +260,14 @@ std::string Normalizer::EncodePrecompiledCharsMap( + std::string blob; + blob.append(string_util::EncodePOD(trie_blob.size())); + blob.append(trie_blob.data(), trie_blob.size()); +- blob.append(normalized.data(), normalized.size()); + + #ifdef IS_BIG_ENDIAN + uint32 *data = reinterpret_cast(const_cast(blob.data())); +- for (int i = 0; i <= trie_blob.size() / 4; ++i) +- data[i] = util::Swap32(data[i]); ++ for (int i = 0; i < blob.size() / 4; ++i) data[i] = util::Swap32(data[i]); + #endif + ++ blob.append(normalized.data(), normalized.size()); ++ + return blob; + } + +@@ -279,8 +279,7 @@ util::Status Normalizer::DecodePrecompiledCharsMap( + if (blob.size() <= sizeof(trie_blob_size) || + !string_util::DecodePOD( + absl::string_view(blob.data(), sizeof(trie_blob_size)), +- &trie_blob_size) || +- trie_blob_size >= blob.size()) { ++ &trie_blob_size)) { + return util::InternalError("Blob for normalization rule is broken."); + } + +@@ -288,15 +287,17 @@ util::Status Normalizer::DecodePrecompiledCharsMap( + trie_blob_size = util::Swap32(trie_blob_size); + #endif + +- if (trie_blob_size >= blob.size()) ++ if (trie_blob_size >= blob.size()) { + return util::InternalError("Trie data size exceeds the input blob size."); ++ } + + blob.remove_prefix(sizeof(trie_blob_size)); + + #ifdef IS_BIG_ENDIAN ++ CHECK_OR_RETURN(buffer); + buffer->assign(blob.data(), trie_blob_size); + uint32 *data = reinterpret_cast(const_cast(buffer->data())); +- for (int i = 0; i < trie_blob_size / 4; ++i) data[i] = util::Swap32(data[i]); ++ 
for (int i = 0; i < buffer->size() / 4; ++i) data[i] = util::Swap32(data[i]); + *trie_blob = absl::string_view(buffer->data(), trie_blob_size); + #else + *trie_blob = absl::string_view(blob.data(), trie_blob_size); +diff --git a/src/unigram_model_trainer_test.cc b/src/unigram_model_trainer_test.cc +index 9d2c526..31da90b 100644 +--- a/src/unigram_model_trainer_test.cc ++++ b/src/unigram_model_trainer_test.cc +@@ -106,6 +106,7 @@ TrainerResult RunTrainer(const std::vector& input, int size, + + TrainerResult res; + res.seed_pieces_and_probs = seed_pieces; ++ std::sort(pieces.begin(), pieces.end()); + res.sentence_pieces = absl::StrJoin(pieces, " "); + return res; + } +@@ -119,10 +120,8 @@ TEST(UnigramTrainerTest, BasicTest) { + // Check seed pieces. + EXPECT_EQ(27, res.seed_pieces_and_probs.size()); + +- LOG(INFO) << "[" << res.sentence_pieces << "]"; +- + // Check final pieces. +- EXPECT_EQ("i a n y m l e apple ve O P r g t an v ▁ b A le ▁an p d h", ++ EXPECT_EQ("A O P a an apple b d e g h i l le m n p r t v ve y ▁ ▁an", + res.sentence_pieces); + } + diff --git a/patches/header-dependencies.patch b/patches/header-dependencies.patch new file mode 100644 index 0000000..2823de7 --- /dev/null +++ b/patches/header-dependencies.patch @@ -0,0 +1,27 @@ +From: Kentaro Hayashi +Date: Mon, 21 Nov 2022 22:17:18 +0900 +Subject: Include necessary headers to ensure IS_BIG_ENDIAN is defined + +normalizer.h uses IS_BIG_ENDIAN, which is defined in util.h. +Include util.h here. 
+ +Author: Steve Langasek +Last-Update: 2022-08-27 +Forwarded: no +Bug-Debian: https://bugs.debian.org/1017360 +--- + src/normalizer.h | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/src/normalizer.h b/src/normalizer.h +index c79813c..37fdb8a 100644 +--- a/src/normalizer.h ++++ b/src/normalizer.h +@@ -22,6 +22,7 @@ + #include + + #include "common.h" ++#include "util.h" + #include "sentencepiece_model.pb.h" + #include "sentencepiece_processor.h" + #include "third_party/absl/strings/string_view.h" diff --git a/patches/series b/patches/series new file mode 100644 index 0000000..6a7db4a --- /dev/null +++ b/patches/series @@ -0,0 +1,4 @@ +disable-static-library.patch +support-python-module-in-place.patch +header-dependencies.patch +fix-ftbfs-big-endian.patch diff --git a/patches/support-python-module-in-place.patch b/patches/support-python-module-in-place.patch new file mode 100644 index 0000000..463bec2 --- /dev/null +++ b/patches/support-python-module-in-place.patch @@ -0,0 +1,56 @@ +From: Kentaro Hayashi +Date: Sat, 17 Jun 2023 22:39:14 +0900 +Subject: Support to build Python module without pkg-config + +--- + python/setup.py | 34 ++++++++++++++++++++-------------- + 1 file changed, 20 insertions(+), 14 deletions(-) + +diff --git a/python/setup.py b/python/setup.py +index 5411231..631a8c4 100755 +--- a/python/setup.py ++++ b/python/setup.py +@@ -77,23 +77,29 @@ class build_ext(_build_ext): + """Override build_extension to run cmake.""" + + def build_extension(self, ext): +- cflags, libs = get_cflags_and_libs('../build/root') +- +- if len(libs) == 0: +- if is_sentencepiece_installed(): +- cflags = cflags + run_pkg_config('cflags') +- libs = run_pkg_config('libs') +- else: +- subprocess.check_call(['./build_bundled.sh', __version__]) +- cflags, libs = get_cflags_and_libs('./build/root') ++ # cflags, libs = get_cflags_and_libs('../build/root') ++ # if len(libs) == 0: ++ # cflags, libs = get_cflags_and_libs('./bundled/root') ++ ++ # if len(libs) == 0: ++ # if 
is_sentencepiece_installed(): ++ # cflags = cflags + run_pkg_config('cflags') ++ # libs = run_pkg_config('libs') ++ # else: ++ # subprocess.check_call(['./build_bundled.sh', __version__]) ++ # cflags, libs = get_cflags_and_libs('./bundled/root') + + # Fix compile on some versions of Mac OSX + # See: https://github.com/neulab/xnmt/issues/199 +- if sys.platform == 'darwin': +- cflags.append('-mmacosx-version-min=10.9') +- else: +- cflags.append('-Wl,-strip-all') +- libs.append('-Wl,-strip-all') ++ # if sys.platform == 'darwin': ++ # cflags.append('-mmacosx-version-min=10.9') ++ # else: ++ # cflags.append('-Wl,-strip-all') ++ # libs.append('-Wl,-strip-all') ++ cflags = ['-I../src'] ++ cmd = "dpkg-architecture -q DEB_BUILD_GNU_TYPE" ++ arch = subprocess.check_output(cmd, shell=True).decode("utf-8").strip().split()[0] ++ libs = ["-L../obj-%s/src" % arch, "-lsentencepiece", "-lsentencepiece_train"] + print('## cflags={}'.format(' '.join(cflags))) + print('## libs={}'.format(' '.join(libs))) + ext.extra_compile_args = cflags diff --git a/python3-sentencepiece.install b/python3-sentencepiece.install new file mode 100644 index 0000000..0cde274 --- /dev/null +++ b/python3-sentencepiece.install @@ -0,0 +1 @@ +usr/lib/python3.*/ diff --git a/rules b/rules new file mode 100755 index 0000000..e0dcf54 --- /dev/null +++ b/rules @@ -0,0 +1,41 @@ +#!/usr/bin/make -f +# -*- makefile -*- +# Sample debian/rules that uses debhelper. +# This file was originally written by Joey Hess and Craig Small. +# As a special exception, when this file is copied by dh-make into a +# dh-make output file, you may use that output file without restriction. +# This special exception was added by Craig Small in version 0.37 of dh-make. + +# Uncomment this to turn on verbose mode. 
+#export DH_VERBOSE=1 +export DEB_BUILD_MAINT_OPTIONS = hardening=+all +DPKG_EXPORT_BUILDFLAGS = 1 +include /usr/share/dpkg/buildflags.mk + +ifneq (,$(filter $(DEB_HOST_ARCH), armel mipsel m68k powerpc sh4)) + export DEB_LDFLAGS_MAINT_APPEND += -Wl,--no-as-needed -latomic -Wl,--as-needed +endif + +%: + dh $@ --with python3 --buildsystem=cmake + +override_dh_auto_configure: + dh_auto_configure --buildsystem=cmake + dh_auto_configure --sourcedirectory=python --buildsystem=pybuild + +override_dh_auto_build: + dh_auto_build --buildsystem=cmake + dh_auto_build --sourcedirectory=python --buildsystem=pybuild + +override_dh_auto_install: basedir=$(shell pwd)/debian +override_dh_auto_install: + dh_auto_install --buildsystem=cmake + dh_auto_install --sourcedirectory=python --buildsystem=pybuild + +override_dh_auto_clean: + dh_auto_clean --buildsystem=cmake + -rm -rf .pybuild + -rm -rf .python/sentencepiece.egg-info + +# Do no tests. +override_dh_auto_test: diff --git a/salsa-ci.yml b/salsa-ci.yml new file mode 100644 index 0000000..1d8d33b --- /dev/null +++ b/salsa-ci.yml @@ -0,0 +1,7 @@ +--- +include: + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/salsa-ci.yml + - https://salsa.debian.org/salsa-ci-team/pipeline/raw/master/pipeline-jobs.yml + +reprotest: + allow_failure: true diff --git a/sentencepiece.docs b/sentencepiece.docs new file mode 100644 index 0000000..8d15174 --- /dev/null +++ b/sentencepiece.docs @@ -0,0 +1 @@ +doc/*.md diff --git a/sentencepiece.install b/sentencepiece.install new file mode 100644 index 0000000..1df36c6 --- /dev/null +++ b/sentencepiece.install @@ -0,0 +1 @@ +usr/bin/* diff --git a/sentencepiece.xml b/sentencepiece.xml new file mode 100644 index 0000000..2a81db2 --- /dev/null +++ b/sentencepiece.xml @@ -0,0 +1,291 @@ + +.
will be generated. You may view the +manual page with: nroff -man .
| less'. A typical entry +in a Makefile or Makefile.am is: + +DB2MAN = /usr/share/sgml/docbook/stylesheet/xsl/nwalsh/manpages/docbook.xsl +XP = xsltproc -''-nonet -''-param man.charmap.use.subset "0" + +manpage.1: manpage.xml + $(XP) $(DB2MAN) $< + +The xsltproc binary is found in the xsltproc package. The XSL files are in +docbook-xsl. A description of the parameters you can use can be found in the +docbook-xsl-doc-* packages. Please remember that if you create the nroff +version in one of the debian/rules file targets (such as build), you will need +to include xsltproc and docbook-xsl in your Build-Depends control field. +Alternatively use the xmlto command/package. That will also automatically +pull in xsltproc and docbook-xsl. + +Notes for using docbook2x: docbook2x-man does not automatically create the +AUTHOR(S) and COPYRIGHT sections. In this case, please add them manually as + ... . + +To disable the automatic creation of the AUTHOR(S) and COPYRIGHT sections +read /usr/share/doc/docbook-xsl/doc/manpages/authors.html. This file can be +found in the docbook-xsl-doc-html package. + +Validation can be done using: `xmllint -''-noout -''-valid manpage.xml` + +General documentation about man-pages and man-page-formatting: +man(1), man(7), http://www.tldp.org/HOWTO/Man-Page/ + +--> + + + + + + + + + + + + + +]> + + + + &dhtitle; + &dhpackage; + + + &dhfirstname; + &dhsurname; + Wrote this manpage for the Debian system. +
+ &dhemail; +
+
+
+ + 2007 + &dhusername; + + + This manual page was written for the Debian system + (but may be used by others). + Permission is granted to copy, distribute and/or modify this + document under the terms of the GNU General Public License, + Version 2 or (at your option) any later version published by + the Free Software Foundation. + On Debian systems, the complete text of the GNU General Public + License can be found in + /usr/share/common-licenses/GPL. + +
+ + &dhucpackage; + &dhsection; + + + &dhpackage; + program to do something + + + + &dhpackage; + + + + + + + + + this + + + + + + + + this + that + + + + + &dhpackage; + + + + + + + + + + + + + + + + + + + DESCRIPTION + This manual page documents briefly the + &dhpackage; and bar + commands. + This manual page was written for the Debian distribution + because the original program does not have a manual page. + Instead, it has documentation in the GNU + info + 1 + format; see below. + &dhpackage; is a program that... + + + OPTIONS + The program follows the usual GNU command line syntax, + with long options starting with two dashes (`--'). A summary of + options is included below. For a complete description, see the + + info + 1 + files. + + + + + + + Does this and that. + + + + + + + Show summary of options. + + + + + + + Show version of program. + + + + + + FILES + + + /etc/foo.conf + + The system-wide configuration file to control the + behaviour of &dhpackage;. See + + foo.conf + 5 + for further details. + + + + ${HOME}/.foo.conf + + The per-user configuration file to control the + behaviour of &dhpackage;. See + + foo.conf + 5 + for further details. + + + + + + ENVIRONMENT + + + FOO_CONF + + If used, the defined file is used as configuration + file (see also ). + + + + + + DIAGNOSTICS + The following diagnostics may be issued + on stderr: + + + Bad configuration file. Exiting. + + The configuration file seems to contain a broken configuration + line. Use the option, to get more info. + + + + + &dhpackage; provides some return codes, that can + be used in scripts: + + Code + Diagnostic + + 0 + Program exited successfully. + + + 1 + The configuration file seems to be broken. + + + + + + BUGS + The program is currently limited to only work + with the foobar library. + The upstream's BTS can be found + at . 
+ + + SEE ALSO + + + bar + 1 + , + baz + 1 + , + foo.conf + 5 + + The programs are documented fully by The Rise and + Fall of a Fooish Bar available via the + info + 1 + system. + +
+ diff --git a/source/format b/source/format new file mode 100644 index 0000000..163aaf8 --- /dev/null +++ b/source/format @@ -0,0 +1 @@ +3.0 (quilt) diff --git a/tests/control b/tests/control new file mode 100644 index 0000000..ff74c01 --- /dev/null +++ b/tests/control @@ -0,0 +1,2 @@ +Tests: python spm-encode +Depends: libsentencepiece0, sentencepiece, python3-sentencepiece diff --git a/tests/python b/tests/python new file mode 100755 index 0000000..b774114 --- /dev/null +++ b/tests/python @@ -0,0 +1,19 @@ +#!/bin/sh + +set -e + +VERSION=$(dpkg-query --show --showformat='${Version}' sentencepiece | cut -d- -f1) +cat <<EOS > test_module.py +import sentencepiece as spm + +print('VERSION={}'.format(spm.__version__)) +EOS + +PYVERSION=$(python3 test_module.py) +if [ "VERSION=$VERSION" = "$PYVERSION" ]; then + echo "run python module" +else + echo "Failed to get module version: <${PYVERSION}>" + exit 1 +fi +rm -f test_module.py diff --git a/tests/spm-encode b/tests/spm-encode new file mode 100755 index 0000000..9a2f28e --- /dev/null +++ b/tests/spm-encode @@ -0,0 +1,26 @@ +#!/bin/sh + +set -e + +cat <<EOS > input.txt +SentencePiece is an unsupervised text tokenizer and detokenizer mainly +for Neural Network-based text generation systems where the vocabulary +size is predetermined prior to the neural model +training. SentencePiece implements subword units (e.g., +byte-pair-encoding (BPE) [Sennrich et al.]) and unigram language model +[Kudo.]) with the extension of direct training from raw +sentences. SentencePiece allows us to make a purely end-to-end system +that does not depend on language-specific pre/postprocessing. +EOS + +rm -f tiny.* +spm_train --input=input.txt --model_prefix=tiny --vocab_size=100 --character_coverage=1.0 >/dev/null 2>&1 +encoded=$(echo "I saw a girl with a telescope." | spm_encode --model=tiny.model) +if [ "▁ I ▁s a w ▁a ▁ g ir l ▁w i t h ▁a ▁t el e s c o p e ."
= "${encoded}" ]; then + echo "run spm_encode test" +else + echo "Failed to spm_encode example: <${encoded}>" + exit 1 +fi +rm -f input.txt +rm -f tiny.* diff --git a/watch b/watch new file mode 100644 index 0000000..336e9c8 --- /dev/null +++ b/watch @@ -0,0 +1,4 @@ +version=4 +opts="filenamemangle=s%(?:.*?)?v?(\d[\d.]*)\.tar\.gz%sentencepiece-$1-Source.tar.xz%" \ + https://github.com/google/sentencepiece/tags \ + (?:.*?/)?v(\d[\d.]*)\.tar\.gz debian uupdate